Built with R version:
3.4.2
Built with R version:
3.4.2
Load necessary libraries
library(affy)
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, cbind, colMeans,
## colnames, colSums, do.call, duplicated, eval, evalq, Filter,
## Find, get, grep, grepl, intersect, is.unsorted, lapply,
## lengths, Map, mapply, match, mget, order, paste, pmax,
## pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
## rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
## tapply, union, unique, unsplit, which, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
library(ComplexHeatmap)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 1.17.1
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://bioconductor.org/packages/ComplexHeatmap/
##
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
## genomic data. Bioinformatics 2016.
## ========================================
library(plot3D)
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(circlize)
## ========================================
## circlize version 0.4.3
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: http://jokergoo.github.io/circlize_book/book/
##
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
## in R. Bioinformatics 2014.
## ========================================
library(AnnotationDbi)
## Loading required package: stats4
## Loading required package: IRanges
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:gplots':
##
## space
## The following object is masked from 'package:base':
##
## expand.grid
library(limma)
##
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
library(lattice)
library(org.Hs.eg.db)
##
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:AnnotationDbi':
##
## select
library(RColorBrewer)
library(AnnotationDbi)
library(rglwidget)
## The functions in the rglwidget package have been moved to rgl.
library(VennDiagram)
## Loading required package: futile.logger
library(org.Hs.eg.db)
library(GenomicRanges)
## Loading required package: GenomeInfoDb
library(GenomicFeatures)
library(rtracklayer)
library(biomaRt)
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
##
## expand
## Loading required package: foreach
## Loaded glmnet 2.0-16
library(survival)
library(Hmisc)
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:AnnotationDbi':
##
## contents
## The following object is masked from 'package:Biobase':
##
## contents
## The following objects are masked from 'package:base':
##
## format.pval, units
library(ConsensusClusterPlus)
library(pheatmap)
library(ggplot2)
library(heatmap.plus)
library(rgl)
library("tmod")
set1 = c(brewer.pal(9,"Set1"), brewer.pal(8, "Dark2"))
violinJitter <- function(x, magnitude=1){
d <- density(x)
data.frame(x=x, y=runif(length(x),-magnitude/2, magnitude/2) * approxfun(d$x, d$y)(x))
}
rotatedLabel <- function(x0 = seq_along(labels), y0 = rep(par("usr")[3], length(labels)), labels, pos = 1, cex=1, srt=45, ...) {
w <- strwidth(labels, units="user", cex=cex)
h <- strheight(labels, units="user",cex=cex)
u <- par('usr')
p <- par('plt')
f <- par("fin")
xpd <- par("xpd")
par(xpd=NA)
text(x=x0 + ifelse(pos==1, -1,1) * w/2*cos(srt/360*2*base::pi), y = y0 + ifelse(pos==1, -1,1) * w/2 *sin(srt/360*2*base::pi) * (u[4]-u[3])/(u[2]-u[1]) / (p[4]-p[3]) * (p[2]-p[1])* f[1]/f[2] , labels, las=2, cex=cex, pos=pos, adj=1, srt=srt,...)
par(xpd=xpd)
}
avefc = function (y, log=TRUE, replace= FALSE) {
if (log) y = 2^y
if (replace) y = y + (1-min(y))
m = apply(y,1,mean)
y.n = y/m
y.n2 = y.n
y.n2 [y.n2 < 1] = 1/ (y.n2 [y.n2 < 1])
ave.fc = apply (y.n2, 1, mean)
return(ave.fc)
}
For gene convertion from array to HUGO
ensembl = useMart( "ensembl", dataset = "hsapiens_gene_ensembl" )
Upload or generate GEP normalized matrix
### choice 1: import processed matrix
setwd("~/Desktop/PTCL/GEP_PTCL_NOS/afinal_data_set/files/")
load("541_PTCL_batch_adjusted_geo.id.Rdata")
geneExpr = adj.data
# import batch and re-order accordingly
load("PTCL.batch.Rdata")
batch = batch [order(batch$nameNEW),]
batch.series = as.vector(batch$center)
### end of choice 1
### choice 2: generate your own affy object and custom data
# download CEL files from GEO series GSE6338, GSE19067, GSE19069, GSE40160, GSE58445, GSE65823 and EBI series ETABM702, ETABM783
# GSM368580.CEL, GSM368582.CEL, GSM368584.CEL, GSM368586.CEL, GSM368589.CEL, GSM368591.CEL, GSM368594.CEL, GSM472164.CEL, GSM1411278.CEL, GSM1411284.CEL, GSM1411285.CEL, GSM1411287.CEL, GSM1411355.CEL, GSM1411364.CEL, GSM1411368.CEL, GSM1411425.CEL, GSM1411427.CEL excluded from the analysis (see Methods for explaination")
### celfiles <- dir("~/Documents/DATI/PTCL.nos/GSE6338-GSE19067-GSE19069-GSE40160-GSE58445-GSE65823-ETABM702-ETABM783/", pattern = ".CEL")
### library(affy)
### gset = justRMA(celfile.path = "/Users/emagene/Documents/DATI/PTCL.nos/GSE6338-GSE19067-GSE19069-GSE40160-GSE58445-GSE65823-ETABM702-ETABM783/", ### filenames = celfiles, sampleNames = gsub(".CEL","", celfiles), cdfname = "hgu133plus2hsentrezgcdf")
### geneExpr = exprs(gset)
### batch adjustment
### library(sva)
### # import batch and re-order accordingly
### load("./Rmd.files/PTCL.batch.Rdata")
### batch = batch [order(rownames(batch)),]
### batch.series = as.vector(batch$center)
### geneExprNEW = geneExpr [ , order(colnames(geneExpr)) ]
### geneExprNEW = geneExprNEW[grep("AFFX",rownames(geneExprNEW), invert=TRUE),]
### # check order correspondence and, if correct, adjust data
### if (all(colnames(geneExprNEW) == rownames(batch))) {
### adj.data = ComBat (geneExprNEW, batch.series, mod = NULL, par.prior = TRUE, prior.plots = TRUE)
### } else {
### cat("Error: colnames and batch did not correspond")
### }
### geneExpr = adj.data
### colnames(geneExpr) = as.vector(batch$nameNEW)
### end of choice 2
Upload paz info with clinical and mutational data
pts.info.data <- read.table("~/Desktop/PTCL/GEP_PTCL_NOS/sept_2017_dropbox_luca/script (1)/Rmd.files/541_paz_info_MUT.txt", sep="\t", header=TRUE, check.names=FALSE, stringsAsFactors = F)
# customize colors for categories
levels(as.factor(pts.info.data$final.molec))
## [1] "AITL" "ALCL.neg" "ALCL.pos" "ATLL" "NKT" "PTCL.nos"
## [7] "T.CD30" "T.CD4" "T.CD8" "T.DR" "T.reg" "TCR-HL"
# "AITL" "ALCL.neg" "ALCL.pos" "ATLL" "NKT" "PTCL.nos" "T.CD30" "T.CD4" "T.CD8" "T.DR" "T.reg" "TCR-HL"
colorz = c("black", "yellow","dodgerblue2","brown2","darkorchid1", "orange", "grey42", "grey52","grey62","grey72","grey82","grey92")
temp = split ( pts.info.data$sample.nameNEW, pts.info.data$final.molec )
colorx = colnames(geneExpr)
length(colorz)
## [1] 12
length(temp)
## [1] 12
for (i in 1:length(colorz)) colorx [ which(colorx %in% unlist(temp[i])) ] = colorz[i]
library(gplots)
colorx = col2hex(colorx)
slices <- table(pts.info.data$final.molec)
lbls <- names(table(pts.info.data$final.molec))
pct <- round(slices/sum(slices)*100)
lbls <- paste(lbls, ": ", slices, " (", pct, "%)", sep="" ) # add percents to labels
par(mfrow=c(1,1))
par(mar=c(3,3,3,3), xpd=F)
pie(slices,labels = lbls, init.angle = 0, col=colorz, main="", cex=0.6, radius=0.8)
# apply variational filter
afc2 = avefc(geneExpr, log=TRUE, replace=FALSE)
data541exprs.vf = geneExpr [afc2 >= 2, ]
# retry PCA on shorted gene list
data541m = t(as.matrix(data541exprs.vf))
pca<-prcomp(data541m,scale=T)
# mfrow3d(nr = 1, nc = 1, sharedMouse = T)
# plot3d(pca$x,rgl.use=F,col=colorx,size=0.6,type="s")
# rglwidget()
mat = as.matrix(data541exprs.vf)
base_mean = rowMeans(mat)
mat_scaled = t(apply(mat, 1, scale))
types = pts.info.data$final.molec
color.annot = col2hex(colorz); names (color.annot)= names(temp)
ha = HeatmapAnnotation(df = data.frame(type = types) , col = list(type = c( color.annot ) ) )
ha@anno_list[[1]]@color_mapping@colors = col2hex(colorz)
names(ha@anno_list[[1]]@color_mapping@colors) = names(temp)
ht = Heatmap(mat_scaled, name = "expression", km = 7, clustering_method_columns = "ward.D", col = colorRamp2(c(-1, 0, 1), c("green", "white", "red")), top_annotation = ha, top_annotation_height = unit(4, "mm"), show_row_names = FALSE, show_column_names = FALSE)
column_order(ht)
## [1] 91 236 85 86 429 88 64 311 270 271 229 231 180 257 87 227 258
## [18] 304 309 436 431 234 148 147 31 29 450 281 500 521 499 273 272 269
## [35] 522 520 501 519 498 518 497 505 506 504 486 485 482 483 484 502 503
## [52] 524 527 529 528 530 532 531 525 526 523 434 61 182 176 63 83 66
## [69] 445 230 515 517 513 296 298 516 494 492 496 493 314 297 294 295 293
## [86] 289 292 512 507 511 510 509 291 290 514 508 448 447 444 440 396 397
## [103] 395 394 391 50 51 187 192 433 349 363 385 428 435 427 389 366 491
## [120] 401 334 459 392 402 388 457 463 446 380 351 474 372 376 489 487 537
## [137] 488 534 536 495 490 535 533 481 479 478 480 477 264 261 268 259 262
## [154] 253 267 265 266 263 65 254 256 255 318 308 320 319 315 324 326 313
## [171] 280 299 285 316 286 278 344 330 430 331 275 305 307 136 426 184 162
## [188] 167 422 421 469 341 245 160 179 153 443 449 181 149 368 186 164 141
## [205] 194 218 161 98 300 312 84 442 323 301 174 306 248 329 325 178 139
## [222] 145 142 144 185 157 249 172 177 158 183 150 170 171 152 131 134 137
## [239] 216 214 213 241 202 205 203 191 130 133 195 215 244 211 246 238 197
## [256] 224 226 225 169 538 168 539 540 219 223 220 221 222 206 204 232 242
## [273] 200 243 143 250 154 207 198 235 240 247 252 163 155 383 188 239 417
## [290] 276 406 93 303 339 109 53 124 82 26 32 23 22 20 189 18 398
## [307] 41 44 10 193 217 129 2 140 208 251 199 201 209 210 212 233 237
## [324] 288 284 420 439 274 352 287 441 328 283 282 321 317 310 279 322 332
## [341] 302 348 419 399 387 166 159 337 467 151 277 355 359 410 353 470 475
## [358] 466 175 370 42 411 374 393 404 327 456 94 432 541 156 173 146 165
## [375] 135 128 106 458 361 364 354 90 454 27 403 138 196 462 453 338 415
## [392] 407 379 371 464 461 110 79 121 45 405 425 96 102 424 423 360 452
## [409] 451 365 358 89 1 28 19 347 101 260 455 59 418 367 48 69 190
## [426] 333 132 17 378 350 468 346 9 414 409 413 412 408 116 70 122 123
## [443] 340 381 460 382 77 38 108 37 228 49 68 67 416 62 92 71 4
## [460] 103 126 36 120 127 56 74 13 12 72 104 105 100 21 30 25 33
## [477] 24 465 345 117 343 342 471 336 476 356 15 60 390 357 362 99 97
## [494] 437 39 78 438 80 375 40 113 112 52 58 400 57 55 54 76 11
## [511] 114 73 75 472 3 6 377 119 47 35 125 5 95 81 34 46 43
## [528] 16 14 386 8 473 335 111 384 7 118 373 369 107 115
rle.custom = function (a, logged2 = TRUE, file = NULL, colorbox= NULL, labels=NULL , legend = NULL ) {
a.m <- apply(a,1,median)
if (logged2) {
for (i in 1:dim(a)[2]) {
a [,i] <- a [,i] - a.m
}
} else {
for (i in 1:dim(a)[2]) {
a [,i] <- log (a [,i] / a.m )
}
}
# png(file,10240,3840)
# par(mar=c(10,4,6,2))
# boxplot (a, ylim= c(-5,5), outline=F, col=colorbox, xlab="pts", names=labels, las=2, cex.axis = 1.5, main="RLE", xlim = c(1,600), cex.main = 5 )
# legend("bottomright",legend = c(levels(as.factor(pts.info.data$final.molec))),
# fill = colorz, # 6:1 reorders so legend order matches graph
# title = "Legend",
# cex = 5)
# dev.off()
a.c = apply(a, 2, stats::quantile)
return(a.c)
}
rle.medians = rle.custom(geneExpr, colorbox=colorx, file="./RLE.541.png", labels=pts.info.data$sample.nameNEW )
plot(rle.medians[3,], type="l", xlab="pts", ylab="RLE median" )
Define design file and filter geneExpr for patients included in design data frame and
design <- pts.info.data[,c(1:2,6:8,14:17)]
rownames(design)<- design[,1]
design<- design[,-c(1:2)]
design<-na.omit(design) ### select onyl patients with all mutations data available (n=53)
design$age<- as.numeric(as.character(design$age))
design$age<- design$age - median(design$age)
design[design == "WT"] <- 0
design[design == "MUT"] <- 1
design$final.molec[design$final.molec=="AITL"] <- 0
design$final.molec[design$final.molec=="PTCL.nos"] <- 1
design$gender[design$gender=="M"] <- 1
design$gender[design$gender=="F"] <- 0
design$offset <- rep(1, nrow(design))
design<-design[,c(8,1:7)]
all(pts.info.data$sample.nameNEW == colnames(geneExpr)) ## check correspondence
## [1] TRUE
# geneExpr = geneExpr [ , order (pts.info.data$geo.id)] ### do only to set correspondence in case of custom procedure
# colnames(geneExpr) = pts.info.data$sample.nameNEW [ order (pts.info.data$geo.id)]
geneExpr2<- (geneExpr[, rownames(design)])
geneExpr2<- data.matrix(geneExpr2, rownames.force = NA)
design<- data.matrix(design, rownames.force = NA)
We use the lmFit function from the limma package. This comes with a whole series of powerful and reliable tests.
glm = lmFit(geneExpr2[,rownames(design)], design = design )
glm = eBayes(glm)
F.stat <- classifyTestsF(glm[,-1],fstat.only=TRUE)
glm$F <- as.vector(F.stat)
df1 <- attr(F.stat,"df1")
df2 <- attr(F.stat,"df2")
if(df2[1] > 1e6){
glm$F.p.value <- pchisq(df1*glm$F,df1,lower.tail=FALSE)
}else
glm$F.p.value <- pf(glm$F,df1,df2,lower.tail=FALSE)
set.seed(12345678)
rlm <- lmFit(geneExpr[,rownames(design)], apply(design, 2, sample))
rlm <- eBayes(rlm)
F.stat <- classifyTestsF(rlm[,-1],fstat.only=TRUE)
rlm$F <- as.vector(F.stat)
df1 <- attr(F.stat,"df1")
df2 <- attr(F.stat,"df2")
if(df2[1] > 1e6){
rlm$F.p.value <- pchisq(df1*rlm$F,df1,lower.tail=FALSE)
}else
rlm$F.p.value <- pf(rlm$F,df1,df2,lower.tail=FALSE)
F.stat <- classifyTestsF(glm[,2:5],fstat.only=TRUE)
df1 <- attr(F.stat,"df1")
df2 <- attr(F.stat,"df2")
F.p.value <- pchisq(df1*F.stat,df1,lower.tail=FALSE)
R.stat <- classifyTestsF(rlm[,2:5],fstat.only=TRUE)
Rall = 1 - 1/(1 + glm$F * (ncol(design)-1)/(nrow(design)-ncol(design)))
Rgenetics = 1 - 1/(1 + F.stat * 4/(nrow(design)-ncol(design)))
Pgenetics = 1 - 1/(1 + R.stat * 4/(nrow(design)-ncol(design)))
names(Rgenetics) <- names(Pgenetics) <- names(Rall) <- rownames(geneExpr)
par(bty="n", mgp = c(2,.33,0), mar=c(3,2.5,1,1)+.1, las=1, tcl=-.25, xpd=NA)
d <- density(Pgenetics,bw=1e-3)
f <- 40 #nrow(gexpr)/512
par(mfrow=c(1,1))
par(mar=c(8,5,5,5), xpd=F)
plot(d$x, d$y * f, col='grey', xlab=expression(paste("Explained variance per gene ", R^2)), main="", lwd=2, type="l", ylab="", xlim=c(0,1), cex.axis=1.2, cex.lab=1.5, bty="n")
title(ylab="Density", line=2.5, cex.lab=1.5)
d <- density(Rgenetics, bw=1e-3)
r <- min(Rgenetics[p.adjust(F.p.value,"BH")<0.01]) ######## threshold to select 412 genes
x0 <- which(d$x>r)
polygon(d$x[c(x0[1],x0)], c(0,d$y[x0])* f, col=paste(set1[1],"44",sep=""), border=NA)
lines(d$x, d$y* f, col=set1[1], lwd=2)
text(d$x[x0[1]], d$y[x0[1]]*f +20, pos=4, paste(sum(Rgenetics > r), "genes q < 0.01"))
legend("topright", bty="n", col=c(set1[1], "grey"), lty=1, c("Observed","Random"), lwd=2)
glmPrediction <- glm$coefficients %*% t(design)
rlmPrediction <- rlm$coefficients %*% t(design)
Print signficiant genes
kk<-as.data.frame((p.adjust(F.p.value,"BH")<0.01))
kk$gene<- rownames(kk)
colnames(kk)[1]<-"code"
kk2<-kk[kk$code=="TRUE",]
### sort(kk2$gene) ##### if you want to print the entire list of differentially expressed genes
Extract the list of differentially expressed genes by mutations
### customize colors in colMutations
colMutations = col2hex(c("magenta", "purple","gray60","red","lightblue","green","orange"))
names(colMutations) <- colnames(design)[-1]
gene_code<- kk2$gene
tab=NULL
for(i in (1:length(kk2$gene)))
{
gene_single<- gene_code[i]
y <- glm$coefficients[gene_single,-1]+glm$coefficients[gene_single,1]
w <- glm$p.value[gene_single,-1] < 0.01
int<-c(gene_single, as.character(w))
tab<- rbind(tab, int)
}
rownames(tab)<-seq(1:nrow(tab))
colnames(tab)<- c("gene",colnames(design)[-1])
# Write to disk a file with all significant genes
#write.table(tab, "table_differentially_expressed_gene.txt",sep="\t", quote=F, row.names = F, col.names = T)
Example of extraction
par(mfrow=c(1,1))
par(mar=c(10,8,5,5), xpd=F)
par(bty="n", mgp = c(1.5,.33,0),las=1, tcl=-.25, xpd=F)
temp_name<- "YAP1"
plot(glmPrediction[gene_single,], geneExpr[gene_single,rownames(design)], ylab="", xlab="",
pch=16, cex=1, cex.axis=1.2, cex.lab=1.5)
title(ylab=(paste("Observed ",temp_name, " expression")), line=2.5, cex.lab=1.5)
title( xlab=(paste("Predicted ",temp_name, " expression")), line=2.5, cex.lab=1.5)
abline(0,1)
u <- par("usr")
par(xpd=NA)
y <- glm$coefficients[gene_single,-1]+glm$coefficients[gene_single,1]
u <- par("usr")
x0 <- rep(u[3]+1,ncol(design)-1)
y0 <- u[4] + 0.05*(u[4]-u[3]) - rank(-y)/length(y) * (u[4]-u[3])/1.2
d <- density(y)
lines(d$x, d$y/5+1+u[3], col="grey")
lines(d$x, -d$y/5+1+u[3], col="grey")
points(x=y, y=x0+violinJitter(y, magnitude=0.25)$y, col=colMutations, pch=16, cex=1.5)
text(x=glm$coefficients[gene_single,1], y= 5.2, "Model coefficients", cex=0.8)
legend("topleft",names(colMutations), col = colMutations, bty= "n", cex = 1.2, pch = 16)
Plot significant effects per covariate (q<0.05)
testResults <- decideTests(glm, method="hierarchical",adjust.method="BH", p.value=0.01)[,-1]
significantGenes <- sapply(1:ncol(testResults), function(j){
c <- glm$coefficients[testResults[,j]!=0,j+1]
table(cut(c, breaks=c(-5,seq(-1.5,1.5,l=7),5)))
})
colnames(significantGenes) <- colnames(testResults)
rownames(tab)<-c(1:nrow(tab))
tab2<- as.data.frame(tab)
tab2$gene<-as.character(as.character(tab2$gene))
tab2$final.molec<-as.character(as.character(tab2$final.molec))
tab2$TET2<-as.character(as.character(tab2$TET2))
tab2$RHOA<-as.character(as.character(tab2$RHOA))
tab2$IDH2<-as.character(as.character(tab2$IDH2))
tab2$DNMT3A<-as.character(as.character(tab2$DNMT3A))
par(mfrow=c(1,1))
par(bty="n", mgp = c(2.5,.33,0), mar=c(5,5.5,5,0)+.1, las=2, tcl=-.25)
b <- barplot(significantGenes, las=2, ylab = "Differentially expressed genes", col=brewer.pal(8,"RdYlBu"), legend.text=FALSE , border=0, xaxt="n", cex.lab=1.5)#, col = set1[simple.annot[names(n)]], border=NA)
rotatedLabel(x0=b-0.1, y0=rep(-0.5, ncol(significantGenes)), labels=colnames(significantGenes), cex=1.2, srt=45, font=ifelse(grepl("[[:lower:]]", colnames(design))[-1], 1,3), col=colMutations)
rotatedLabel(b-0.1, colSums(significantGenes), colSums(significantGenes), pos=3, cex=, srt=45)#dev.off()
clip(0,30,0,1000)
x0 <- 7.5
image(x=x0+c(0,0.8), y=par("usr")[4]+seq(-1,1,l=9) -4, z=matrix(1:8, ncol=8), col=brewer.pal(8,"RdYlBu"), add=TRUE)
text(x=x0+1.1, y=par("usr")[4]+c(-1,0,1) -4, format(seq(-1,1,l=3),2), cex=0.66)
lines(x=rep(x0+0.9, 2), y=par("usr")[4]+c(-1,1) -4)
segments(x0+0.9,par("usr")[4] + 1-4,x0+0.95,par("usr")[4] + 1-4)
segments(x0+0.9,par("usr")[4] + 0-4,x0+0.95,par("usr")[4] + 0-4)
segments(x0+0.9,par("usr")[4] + -1-4,x0+0.95,par("usr")[4] + -1-4)
text(x0 + 0.45, par("usr")[4] + 1.5-4, "log2 FC", cex=.66)
Print the list of differently expressed genes using the Ensembl annotation
select_hist<- pts.info.data[pts.info.data$final.molec == "AITL" | pts.info.data$final.molec == "PTCL.nos",]
gene<- as.data.frame(testResults)
sig_genes<- gene[gene$final.molec!= 0 |gene$IDH2 != 0 | gene$TET2 != 0 | gene$DNMT3A != 0 | gene$RHOA != 0,]
list_genes<-sort(rownames(sig_genes)) ##### list of signficiant genes
geneannotation1 <- getBM( attributes = c("ensembl_transcript_id", "entrezgene", "external_gene_name"), filters = "entrezgene", values = gsub("_at","",list_genes), mart = ensembl)
sort(unique(geneannotation1$external_gene_name))
## [1] "ADRA2A" "ARHGEF10" "C3" "COL4A4" "DZIP1" "EFNB2"
## [7] "HS3ST3A1" "ID2" "NETO2" "OSMR" "PRRX1" "ROBO1"
## [13] "SLC5A3" "XKR4" "YAP1"
Generate a heatmap with AITL, PTCL-NOS with the extracted differentially expressed genes.
gep<- geneExpr[,select_hist$sample.nameNEW]
mat<- gep[list_genes,]
rownames(mat) = c("AL441992.1",unique(geneannotation1$external_gene_name))
mycol= c("red","white","yellow")
mylabel = select_hist[,c("sample.nameNEW","final.molec","IDH2","RHOA","TET2","DNMT3A")]
rownames(mylabel) = mylabel$sample.nameNEW
mylabel$sample.nameNEW = NULL
mylabel.nocol = mylabel
mylabel.col = mylabel
mylabel.col[is.na(mylabel.col)]<-0
#head(mylabel.col)
mylabel.col$final.molec[mylabel.col$final.molec == "AITL"] = "black"; mylabel.col$final.molec[mylabel.col$final.molec == "PTCL.nos"] = "orange"
for (a in 2:5) mylabel.col[,a] = factor(mylabel.col[,a], levels = levels(as.factor(mylabel.col[,a])), labels = mycol )
mat <- mat - rowMeans(mat)
par(mfrow=c(1,1))
pheatmap(mat, annotation_col = mylabel.nocol, annotation_colors = list(final.molec = c(AITL = "black", PTCL.nos = "orange"), filename= "x.pdf",
IDH2 = c(MUT=mycol[1],"NA"=mycol[2],WT=mycol[3]),
RHOA = c(MUT=mycol[1],"NA"=mycol[2],WT=mycol[3]),
TET2 = c(MUT=mycol[1],"NA"=mycol[2],WT=mycol[3]),
DNMT3A = c(MUT=mycol[1],"NA"=mycol[2],WT=mycol[3]) ) , show_colnames = F, cellheight = 25,
border_color= NA, color = colorRampPalette(rev(brewer.pal(n = 5 , name = "RdYlBu")))(100), scale = "row", clustering_method = "ward.D2",clustering_distance_cols = "euclidean" , silent = F)
Use ConsensusClusterPlus to extract most significant clusters
gep<- geneExpr[,select_hist$sample.nameNEW]
mat<- gep[list_genes,]
title=tempdir()
d<- data.matrix(mat)
d = sweep(d,1, apply(d,1,median,na.rm=T))
results = ConsensusClusterPlus(d,maxK=8,
pFeature=1,
title=title,
clusterAlg="hc",
innerLinkage="ward.D2",
finalLinkage="ward.D2",
distance="euclidean",
seed=123456789)
## end fraction
## clustered
## clustered
## clustered
## clustered
## clustered
## clustered
## clustered
kk<- as.data.frame((results[[4]]$consensusClass)) ##### 4 significant cluster
kk$geo.id<- rownames(kk)
colnames(kk)[1]<- "cluster"
table(kk$cluster)
##
## 1 2 3 4
## 69 107 23 72
Plot heatmap according to the 4-cluster stratification.
## add ConsensusCluster label, samples orderedaccordingly
heat<- merge(t(mat), kk, by.x = 0, by.y="geo.id")
heat2<- merge(heat, pts.info.data, by.x = 1, by.y="sample.nameNEW")
heat2<- heat2[order(heat2$cluster),]
mycol= c("red","white","yellow")
mylabel = heat2[,c("Row.names","cluster","final.molec","TET2","RHOA","IDH2","DNMT3A")]
colnames(mylabel)<- c("sample.names","clusters","Histology","TET2","RHOA","IDH2","DNMT3A")
rownames(mylabel) = mylabel$sample.names
mylabel$sample.names = NULL
mylabel.nocol = mylabel
mylabel.col = mylabel
mylabel.col[is.na(mylabel.col)]<-0
#head(mylabel.col)
mylabel.col$Histology[mylabel.col$Histology == "AITL"] = "black"; mylabel.col$Histology[mylabel.col$Histology == "PTCL.nos"] = "orange"
#mylabel.col$Histology[mylabel.col$Histology == "ALCL.neg"] = "yellow"
for (a in c(3:6)) mylabel.col[,a] = factor(mylabel.col[,a], levels = levels(as.factor(mylabel.col[,a])), labels = mycol )
mycol_plus<- c(brewer.pal(11,"Paired"),brewer.pal(6,"Set2"))
for (a in 1) mylabel.col[,a] = factor(mylabel.col[,a], levels = levels(as.factor(mylabel.col[,a])), labels = mycol_plus[1:4] )
mylabel.nocol$clusters<-as.numeric(as.character(mylabel.nocol$clusters))
mylabel.nocol$clusters<-as.character(paste("cluster",mylabel.nocol$clusters, sep=""))
# plot heatmap
par(mfrow=c(1,1))
mat3<- t(data.matrix(heat2[,2:17]))
colnames(mat3)<-heat2$Row.names
mat3= mat3[order(rownames(mat3)),]
temp_name = getBM( attributes = c("ensembl_transcript_id", "entrezgene", "external_gene_name"), filters = "entrezgene", values = gsub("_at","",rownames(mat3)), mart = ensembl)[,c(2:3)]
temp_name = temp_name[!duplicated(temp_name[,1]),]
rownames(mat3) = c("AL441992.1",temp_name [,2] )
mat3 <- mat3 - rowMeans(mat3)
pheatmap(mat3, annotation_col = mylabel.nocol, annotation_colors = list(clusters = c(cluster1= mycol_plus[1], cluster2 = mycol_plus[2], cluster3 = mycol_plus[3], cluster4 = mycol_plus[4]), Histology = c(AITL = "black", PTCL.nos = "orange"), IDH2 = c(MUT=mycol[1], "NA"=mycol[2],WT=mycol[3]), RHOA = c(MUT=mycol[1],"NA"=mycol[2],WT=mycol[3]), TET2 = c(MUT=mycol[1],"NA"=mycol[2],WT=mycol[3]), DNMT3A = c(MUT=mycol[1],"NA"=mycol[2],WT=mycol[3]) ) , border_color= NA, color = colorRampPalette(rev(brewer.pal(n = 5 , name = "RdYlBu")))(100), scale = "row", cluster_cols = FALSE, show_colnames= F)
Analyze sample stratification based on the extracted differentially expressed genes betwee AILT and PTCL-nos and the ALCL ALK-negative 3-gene model.
select_hist<- pts.info.data[pts.info.data$final.molec == "AITL" | pts.info.data$final.molec == "PTCL.nos" | pts.info.data$final.molec == "ALCL.neg",]
# Add three classifier genes for ALCL ALK-neg [Agnelli et al, Blood, 2012]
# Check on array
anaplastic_gene<- c("TNFRSF8","BATF3","TMOD1")
geneannotation2 <- getBM( attributes = c("entrezgene", "external_gene_name"), filters = "external_gene_name", values = anaplastic_gene, mart = ensembl )
anaplastic_gene_ARRAY<- paste0(geneannotation2$entrezgene, "_at")
# Append 16-gene model to 3-gene model
list_genes_all<- c(list_genes, anaplastic_gene_ARRAY)
# Redo consensus cluster analysis
gep<- geneExpr[,select_hist$sample.nameNEW]
mat<- gep[list_genes_all,]
title=tempdir()
d<- data.matrix(mat)
d = sweep(d,1, apply(d,1,median,na.rm=T))
results = ConsensusClusterPlus(d,maxK=8,
pFeature=1,
title=title,
clusterAlg="hc",
innerLinkage="ward.D2",
finalLinkage="ward.D2",
distance="euclidean",
seed=123456789)
## end fraction
## clustered
## clustered
## clustered
## clustered
## clustered
## clustered
## clustered
kk_5<- as.data.frame((results[[5]]$consensusClass)) ##### 4 significant cluster
kk_5$geo.id<- rownames(kk_5)
colnames(kk_5)[1]<- "cluster"
table(kk_5$cluster)
##
## 1 2 3 4 5
## 87 103 32 63 55
Plot heatmap AITL, PTCL-NOS, ALCL-neg and the 19-gene model
heat<- merge(t(mat), kk_5, by.x = 0, by.y="geo.id")
heat2<- merge(heat, pts.info.data, by.x = 1, by.y="sample.nameNEW")
heat2<- heat2[order(heat2$cluster),]
mycol= c("red","white","yellow")
mylabel = heat2[,c("Row.names","cluster","final.molec","TET2","RHOA","IDH2","DNMT3A")]
colnames(mylabel)<- c("sample.names","clusters","Histology","TET2","RHOA","IDH2","DNMT3A")
rownames(mylabel) = mylabel$sample.names
mylabel$sample.names = NULL
mylabel.nocol = mylabel
mylabel.col = mylabel
mylabel.col[is.na(mylabel.col)]<-0
#head(mylabel.col)
mylabel.col$Histology[mylabel.col$Histology == "AITL"] = "black"; mylabel.col$Histology[mylabel.col$Histology == "PTCL.nos"] = "orange"; mylabel.col$Histology[mylabel.col$Histology == "ALCL.neg"] = "yellow"
for (a in c(3:6)) mylabel.col[,a] = factor(mylabel.col[,a], levels = levels(as.factor(mylabel.col[,a])), labels = mycol )
mycol_plus<- c(brewer.pal(11,"Paired"),brewer.pal(6,"Set2"))
for (a in 1) mylabel.col[,a] = factor(mylabel.col[,a], levels = levels(as.factor(mylabel.col[,a])), labels = mycol_plus[1:5] )
mylabel.nocol$clusters<-as.numeric(as.character(mylabel.nocol$clusters))
mylabel.nocol$clusters<-as.character(paste("cluster",mylabel.nocol$clusters, sep=""))
par(mfrow=c(1,1))
par(mar=c(5,5,5,5), xpd=F)
mat3<- t(data.matrix(heat2[,2:20]))
colnames(mat3)<-heat2$Row.names
mat3= mat3[order(rownames(mat3)),]
temp_name = getBM( attributes = c("ensembl_transcript_id", "entrezgene", "external_gene_name"), filters = "entrezgene", values = gsub("_at","",rownames(mat3)), mart = ensembl)[,c(2:3)]
temp_name = temp_name[!duplicated(temp_name[,1]),]
rownames(mat3) = c("AL441992.1",temp_name [,2] )
mat3 <- mat3 - rowMeans(mat3)
par(mfrow=c(1,1))
pheatmap(mat3, annotation_col = mylabel.nocol, annotation_colors = list(clusters = c(cluster1= mycol_plus[1], cluster2 = mycol_plus[2], cluster3 = mycol_plus[3], cluster4 = mycol_plus[4], cluster5 = mycol_plus[5]), Histology = c(AITL = "black", PTCL.nos = "orange", ALCL.neg= "yellow"), IDH2 = c(MUT=mycol[1], "NA"=mycol[2],WT=mycol[3]), RHOA = c(MUT=mycol[1],"NA"=mycol[2],WT=mycol[3]), TET2 = c(MUT=mycol[1],"NA"=mycol[2],WT=mycol[3]), DNMT3A = c(MUT=mycol[1],"NA"=mycol[2],WT=mycol[3]) ) , border_color= NA, color = colorRampPalette(rev(brewer.pal(n = 5 , name = "RdYlBu")))(100), scale = "row", cluster_cols = FALSE, show_colnames= F, row_annotation =3, cellheight = 20)
LOOCV on AILT, ALCL neg, PTCLnos based on 19-gene model
y = t(mat3)
cl.orig = c()
for (u in 1:nrow(y)) cl.orig [u] = unlist(strsplit(rownames(y)[u],"\\."))[1]
perm.mother = rownames(y)
perm.son = combn (perm.mother, length(perm.mother)-1)
output <- cbind(perm.mother, NA)
for (i in 1:length(perm.mother)) {
train <- y [ perm.son[,i], ]
test <- y [ ! ( rownames(y) %in% perm.son[,i]) , ]
cl <- cl.orig [which(rownames(y)%in%perm.son[,i])]
z <- lda(train, cl)
p <- predict(z,test)$class
output [ setdiff(1:340, which( rownames(y) %in% perm.son[,i]) ) , 2 ] = as.character(p)
# output [ output[,1] == rownames(test) , 3 ] = z$scaling [1,1]
# output [ output[,1] == rownames(test) , 4 ] = z$scaling [2,1]
# output [ output[,1] == rownames(test) , 5 ] = z$scaling [3,1]
}
colnames(output) = c("true","LOOCV.predicted")
output = as.data.frame(output)
output$true.class = cl.orig
table(output$true.class, output$LOOCV.predicted )
##
## AITL ALCL PTCL
## AITL 109 1 17
## ALCL 5 44 20
## PTCL 13 11 120
Cibersort algorithm to characterize the tumour and microenviroment composition of each cluster
##### cibersort and origical molecular histologies
cibersort.percentages<- read.delim("cibersort.percentages.txt", sep="\t", stringsAsFactors = F, header = T)
ciber_all<-as.data.frame.matrix(t(cibersort.percentages))
ciber_all$sample.nameNEW <- rownames(ciber_all)
colnames(kk_5)[2]<-"sample.nameNEW"
require(plyr)
## Loading required package: plyr
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:Hmisc':
##
## is.discrete, summarize
## The following object is masked from 'package:IRanges':
##
## desc
## The following object is masked from 'package:S4Vectors':
##
## rename
final <-join(ciber_all, kk_5, by = "sample.nameNEW", type="left")
final2<-merge(pts.info.data[,c(1,6,14:17)], final, by="sample.nameNEW")
final3<- subset(final2, final.molec %in% c("AITL","ALCL.neg","ALCL.pos","ATLL","NKT","PTCL.nos"))
final3<- final3[order(final3$final.molec),]
library(RColorBrewer)
n <- 22
qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
par(mar=c(2,5,7,10), xpd=TRUE)
x<- barplot(t(final3[7:28]), names.arg = rep("", length(final3$final.molec)), cex.names = 0.7, col=col_vector, border=NA,
space=rep(0, nrow(final3)))
legend("topright",legend=colnames(final3)[7:28], col=col_vector, pch=c(15), inset=c(-0.11,0), pt.cex= 1,
cex = 1, bty = "n", x.intersp = 0.7)
names_hist<- unique(final3$final.molec)
col_hist<- c("orange","yellow","dodgerblue2","brown2","darkorchid1","black")
num<- as.numeric(table(final3$final.molec))
for(i in (1:length(num)))
{
segments(x[sum(num[1:i])+1-num[i]], 1.05,x[sum(num[1:i])],1.05,lwd=4, col=col_hist[i])
text(x[(sum(num[1:i])-num[i] +1+ sum(num[1:i]))/2], 1.1, names_hist[i], cex=1.2, srt=0)
}
####################################################################################################
####
#### devided for histoogy and clusters
####
####################################################################################################
for(i in (1:nrow(final3)))
{
final3$cluster[i][is.na(final3$cluster[i])]<- final3$final.molec[i]
}
final3$cluster <- factor(final3$cluster, levels = c( "1","2","4","NKT","3","5","ALCL.pos", "ATLL"))
final3<- final3[order(final3$cluster),]
par(mar=c(2,5,7,10), xpd=TRUE)
x<- barplot(t(final3[7:28]), names.arg = rep("", length(final3$final.molec)), cex.names = 0.7, col=col_vector, border=NA,
space=rep(0, nrow(final3)))
legend("topright",legend=colnames(final3)[7:28], col=col_vector, pch=c(15), inset=c(-0.11,0), pt.cex= 1,
cex = 1, bty = "n", x.intersp = 0.7)
mycol_plus<- c(brewer.pal(11,"Paired"),brewer.pal(6,"Dark2"))
names_hist<- c("C-1","C-2", "C-4","NKT","C-3","C-5","ALCL.pos","ATLL")
col_hist<- c(mycol_plus[1],mycol_plus[2],mycol_plus[4],"darkorchid1",mycol_plus[3],mycol_plus[5],"dodgerblue2","brown2","")
num<- as.numeric(table(final3$cluster))
par(new=TRUE)
for(i in (1:(length(num))))
{
segments(x[sum(num[1:i])+1-num[i]], 1.05,x[sum(num[1:i])],1.05,lwd=4, col=col_hist[i])
text(x[(sum(num[1:i])-num[i] +1+ sum(num[1:i]))/2], 1.1, names_hist[i], cex=1.2, srt=0)
}
annotation_col<- final3[,c("TET2","RHOA","IDH2","DNMT3A","final.molec","cluster")]
annotation_col[is.na(annotation_col)]<-0
rownames(annotation_col)<- final3$sample.nameNEW
colnames(annotation_col)[5]<-c("Hist")
A <- function(x) (as.factor(as.character(x))) ##### lapply function for all columns to generate the relative contribution
annotation_col[,1:ncol(annotation_col)] = apply(annotation_col[,1:ncol(annotation_col)], 2, function(x) as.factor(as.character(x)))
annotation_col<- as.data.frame(annotation_col)
mycol_plus<- c(brewer.pal(11,"Paired"),brewer.pal(6,"Dark2"))
color.annot<- as.character(color.annot)
ann_colors = list(Hist=c( "AITL"=color.annot[1],"ALCL.neg"=color.annot[2],"PTCL.nos"=color.annot[6], "NKT" =color.annot[5] ,
"ALCL.pos" =color.annot[3],"ATLL"=color.annot[4]),
cluster=c("1" = mycol_plus[1],"2" = mycol_plus[2],"4" = mycol_plus[4],"NKT" = color.annot[5],
"3" = mycol_plus[3],"5" = mycol_plus[5], "ALCL.pos" = color.annot[3],"ATLL" =color.annot[4] ),
DNMT3A = c("MUT"="red","WT"= "yellow2","0"="grey96"),
IDH2 = c("MUT"="red","WT"= "yellow2","0"="grey96"),
RHOA = c("MUT"="red","WT"= "yellow2","0"="grey96"),
TET2 = c("MUT"="red","WT"= "yellow2","0"="grey96")
)
edata<- as.matrix(((final3[,c(7:28)])))
rownames(edata)<-final3$sample.nameNEW
library(pheatmap)
pheatmap(as.matrix( t(edata)), annotation_col=annotation_col, annotation_colors = ann_colors,
breaks = c(seq(0, 0.1, by= 0.001), seq(0.101, 0.2, by= 0.005),seq(0.21, 1, by= 0.01 )) , color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(204), cellheight = 20,
cluster_cols = F, border_color="NA", show_colnames = F)
##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### #####
#####
#####
##### focus the analysis on AITL, PTCL-NOS and ALCL-neg
#####
#####
##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### #####
load("cibersort.all.Rdata")
ciber<-as.data.frame.matrix(t(cibersort.percentages))
ciber_select<-ciber[rownames(kk_5),]
ciber_select$sample.nameNEW<- rownames(ciber_select)
final<- merge(ciber_select, kk_5, by="sample.nameNEW")
final2<- merge(final, pts.info.data, by="sample.nameNEW")
final2<- final2[order(final2$cluster),]
annotation_col<- final2[,c("TET2","RHOA","IDH2","DNMT3A","final.molec","cluster")]
annotation_col[is.na(annotation_col)]<-0
rownames(annotation_col)<- final2$sample.nameNEW
colnames(annotation_col)[5]<-c("Hist")
A <- function(x) (as.factor(as.character(x))) ##### lapply function for all columns to generate the relative contribution
annotation_col[,1:ncol(annotation_col)] = apply(annotation_col[,1:ncol(annotation_col)], 2, function(x) as.factor(as.character(x)))
annotation_col<- as.data.frame(annotation_col)
mycol_plus<- c(brewer.pal(11,"Paired"),brewer.pal(6,"Dark2"))
mycol_plus<- c(brewer.pal(11,"Paired"),brewer.pal(6,"Set2"))
color.annot<- as.character(color.annot)
ann_colors = list(Hist=c( "AITL"=color.annot[1],"ALCL.neg"=color.annot[2],"PTCL.nos"=color.annot[6]),
cluster=c("1" = mycol_plus[1],"2" = mycol_plus[2],"3" = mycol_plus[3],"4" = mycol_plus[4],"5" = mycol_plus[5]),
DNMT3A = c("MUT"="red","WT"= "yellow2","0"="grey96"),
IDH2 = c("MUT"="red","WT"= "yellow2","0"="grey96"),
RHOA = c("MUT"="red","WT"= "yellow2","0"="grey96"),
TET2 = c("MUT"="red","WT"= "yellow2","0"="grey96")
)
edata<- as.matrix(((final2[,c(2:23)])))
rownames(edata)<-final2$sample.nameNEW
library(pheatmap)
pheatmap(as.matrix( t(edata)), annotation_col=annotation_col, annotation_colors = ann_colors,
breaks = c(seq(0, 0.1, by= 0.001), seq(0.101, 0.2, by= 0.005),seq(0.21, 1, by= 0.01 )) , color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(204), cellheight = 15,
cluster_cols = F, border_color="NA")
Boxplot comparing the contribution of each cibersort signature between all extracted clusters
par(mfrow=c(1,2))
par(mar=c(3,3,3,3), xpd=F)
for(i in (2:23))
{
k<- as.numeric(final2[,i])
table_wilk<- pairwise.wilcox.test(k,final2$cluster,p.adjust.methods = "bonferroni" )$p.value
df_wilk <- data.frame(expand.grid(dimnames(table_wilk)),array(table_wilk))
df_wilk2<-na.omit(df_wilk)
df_wilk2_sig<- df_wilk2[df_wilk2$array.table_wilk.<0.05,]
df_wilk2_sig$Var1<-as.numeric(as.character(df_wilk2_sig$Var1))
df_wilk2_sig$Var2<-as.numeric(as.character(df_wilk2_sig$Var2))
if(nrow(df_wilk2_sig)>0)
{
boxplot(k~final2$cluster, ylim=c(0,(max(k)+0.2)), main=colnames(final2)[i], cex.main=2, col=mycol_plus, las=2)
for(j in (1:nrow(df_wilk2_sig)))
{
segments(df_wilk2_sig$Var1[j], max(k)-0.01+j/30, df_wilk2_sig$Var2[j],max(k)-0.01+j/30)
p<-df_wilk2_sig$array.table_wilk.[j]
if(p<0.00001){p2 = "<0.00001"}else{
p2<-as.numeric(formatC(p,digits=6,format="f"))}
pval <- paste("p =",p2,sep=" ")
text((df_wilk2_sig$Var1[j]+ df_wilk2_sig$Var2[j])/1.9, max(k) +j/30, pval, cex=0.8)
}
}
}
final<- read.delim("aitl_nos_alcl_clsutering.txt",sep="\t",header = T,stringsAsFactors = F)
final2<- final[,c("Row.names","hist","cluster")]
mat<- read.delim("ensembl_annotated_matrix.txt", sep="\t", stringsAsFactors = F) #### ensembl annotated gep marrix
design <- model.matrix(~ 0+factor(final2$cluster)) ##### create matrix
colnames(design)<-paste0("Cluster_",c(1:5))
contrast.matrix <- makeContrasts(Cluster_2-Cluster_1, Cluster_3-Cluster_1,Cluster_4-Cluster_1, Cluster_5-Cluster_1,
Cluster_3-Cluster_2, Cluster_4-Cluster_2, Cluster_5-Cluster_2,
Cluster_4-Cluster_3, Cluster_5-Cluster_3,
Cluster_4-Cluster_5,
Cluster_2-(Cluster_1 + Cluster_3 + Cluster_4 + Cluster_5)/4,
Cluster_3-(Cluster_1 + Cluster_2 + Cluster_4 + Cluster_5)/4,
Cluster_4-(Cluster_1 + Cluster_2 + Cluster_3 + Cluster_5)/4,
Cluster_1-(Cluster_2 + Cluster_3 + Cluster_4 + Cluster_5)/4,
Cluster_5-(Cluster_2 + Cluster_3 + Cluster_4 + Cluster_1)/4,
levels=design)
fit1 <- lmFit(mat, design)
fit2 <- contrasts.fit(fit1, contrast.matrix)
fit <- eBayes(fit2)
Supervised analysis between clusters
geneExpr = adj.data
geneExpr2<- geneExpr[,colnames(geneExpr) %in% final2$Row.names ]
geneExpr2<- geneExpr2[,final2$Row.names]
ensembl = useMart( "ensembl", dataset = "hsapiens_gene_ensembl" )
hgnc_swissprot <- getBM(attributes=c('entrezgene','hgnc_symbol','hgnc_id'),filters = 'entrezgene',
values = gsub("_at","",rownames(geneExpr2)),mart = ensembl)
geneExpr3<- as.data.frame.matrix(geneExpr2[which(rownames(geneExpr2) %in% paste0(hgnc_swissprot$entrezgene,"_at")),])
levels_design<- c("Cluster_2-Cluster_1","Cluster_3-Cluster_1","Cluster_4-Cluster_1","Cluster_5-Cluster_1",
"Cluster_3-Cluster_2","Cluster_4-Cluster_2","Cluster_5-Cluster_2","Cluster_4-Cluster_3",
"Cluster_5-Cluster_3","Cluster_4-Cluster_5",
"Cluster_2-(Cluster_1 + Cluster_3 + Cluster_4 + Cluster_5)/4",
"Cluster_3-(Cluster_1 + Cluster_2 + Cluster_4 + Cluster_5)/4",
"Cluster_4-(Cluster_1 + Cluster_2 + Cluster_3 + Cluster_5)/4",
"Cluster_1-(Cluster_2 + Cluster_3 + Cluster_4 + Cluster_5)/4",
"Cluster_5-(Cluster_2 + Cluster_3 + Cluster_4 + Cluster_1)/4")
df_diff_all=NULL
for(i in (1:length(levels_design)))
{
tt <- topTable(fit, coef=i, number=Inf, genelist=rownames(geneExpr3))
tt$ID<- rownames(tt)
colnames(tt)[1]<-"GENE_SYMBOL"
head(tt, 10)
fg <- tt$GENE_SYMBOL[tt$adj.P.Val < 0.001 & abs( tt$logFC ) > 2]
length(fg)
df_diff<- cbind(fg, rep(levels_design[i], length(fg)))
df_diff_all<-rbind(df_diff_all, df_diff)
#plot(tt$logFC, -log10(tt$adj.P.Val))
}
df_diff_all<- as.data.frame.matrix(df_diff_all)
annotation_col<- final2
colnames(annotation_col)<-c("sampleID","Hist","cluster")
A <- function(x) (as.factor(as.character(x))) ##### lapply function for all columns to generate the relative contribution
annotation_col[,1:ncol(annotation_col)] = apply(annotation_col[,1:ncol(annotation_col)], 2, function(x) as.factor(as.character(x)))
annotation_col<- as.data.frame(annotation_col[,-1])
mycol_plus<- c(brewer.pal(11,"Paired"),brewer.pal(6,"Dark2"))
ann_colors = list(Hist=c( "AITL"="black","ALCL"="yellow","PTCL"="orange"),
cluster=c("1" = mycol_plus[1],"2" = mycol_plus[2],"3" = mycol_plus[3],"4" = mycol_plus[4],"5" =mycol_plus[5])
)
edata3<- mat[rownames(mat) %in% unique(df_diff_all$fg),]
pheatmap(as.matrix( as.matrix(edata3)), annotation_col=annotation_col, annotation_colors = ann_colors, border_color="NA",
scale = "row", cluster_cols = FALSE, show_colnames= F, show_rownames = FALSE)
############ table of genes
df_diff_all_tab=NULL
for(i in (1:length(levels_design)))
{
tt <- topTable(fit, coef=i, number=Inf, genelist=rownames(geneExpr3))
tt$ID<- rownames(tt)
colnames(tt)[1]<-"GENE_SYMBOL"
head(tt,10)
fg <- tt[tt$adj.P.Val < 0.001 & abs( tt$logFC ) > 2,]
if(nrow(fg)>0){
fg$design<- levels_design[i]
df_diff_all_tab<-rbind.data.frame(df_diff_all_tab, fg)
#plot(tt$logFC"," -log10(tt$adj.P.Val))
}
}
nrow(df_diff_all_tab) #### number of genes differentially expressed between C-1, C-2, C-3, C-4, C-5
## [1] 668
##### list gene from Iqbal et al. blood 2014
iqbal<- unique(c("EFNB2","ROBO1","S1PR3","ANK2","LPAR1","SNAP91","SOX8","LPAR1","RAMP3","S1PR3","ROBO1","EFNB2","TUBB2B","SOX8","SOX8","ARHGEF10","DMRT1","SLC19A21","STK3","PERP","TNFRSF8","TMOD1","BATF3","CDC14B","PERP","WDFEY3","TMOD1","ATP6V0D1","AXL","CD59","CHI3L1","CLTC","COL6A1","CREG1","CTSB","CTSC","NR1","H3","PDXK","PITPNA","PLSCR1","PRDX3","CTSS","CYBB","FABP3","FPR1","FTL","GUCA2A","HCK","IFI30","IL13RA1","JAK2","LILRB1","PRKG1PSAP","SLC7A7","SOD2","TCN2","THY1","TYR","UBE2L6","WARS","AXL","FTL","SIRPA","STAT1","CSF2","IFNG","SEPT6","GATA3","CD28","STAT1","AXL","CD28","CD40","CD59","CSF2","FTL","IFNG","LILRB1","SIRPA","TBX21","MSH6","EGR1","CAT","EGR1","CAT"))
intersect(iqbal, unique(df_diff_all_tab$GENE_SYMBOL))
## [1] "ROBO1" "LPAR1" "SOX8" "TUBB2B" "TNFRSF8" "TMOD1"
## [7] "BATF3" "ATP6V0D1" "CHI3L1" "CREG1" "CTSB" "CTSC"
## [13] "FTL" "HCK"
Pairwise for pathway using tmod (https://cran.r-project.org/web/packages/tmod/vignettes/tmod.pdf)
fit1 <- lmFit(mat, design)
fit2 <- contrasts.fit(fit1, contrast.matrix)
fit <- eBayes(fit2)
res.l <- tmodLimmaTest(fit, rownames(mat))
length(res.l)
## [1] 15
names(res.l)
## [1] "Cluster_2 - Cluster_1"
## [2] "Cluster_3 - Cluster_1"
## [3] "Cluster_4 - Cluster_1"
## [4] "Cluster_5 - Cluster_1"
## [5] "Cluster_3 - Cluster_2"
## [6] "Cluster_4 - Cluster_2"
## [7] "Cluster_5 - Cluster_2"
## [8] "Cluster_4 - Cluster_3"
## [9] "Cluster_5 - Cluster_3"
## [10] "Cluster_4 - Cluster_5"
## [11] "Cluster_2 - (Cluster_1 + Cluster_3 + Cluster_4 + Cluster_5)/4"
## [12] "Cluster_3 - (Cluster_1 + Cluster_2 + Cluster_4 + Cluster_5)/4"
## [13] "Cluster_4 - (Cluster_1 + Cluster_2 + Cluster_3 + Cluster_5)/4"
## [14] "Cluster_1 - (Cluster_2 + Cluster_3 + Cluster_4 + Cluster_5)/4"
## [15] "Cluster_5 - (Cluster_2 + Cluster_3 + Cluster_4 + Cluster_1)/4"
pie <- tmodLimmaDecideTests(fit, genes=rownames(mat))
par(mfrow=c(1,1))
res.l2<- lapply(res.l, function(x) {x[x$adj.P.Val<10e-8,]})
tmodPanelPlot(res.l2, pie=pie, text.cex=0.6) ##### zero = grey, blue down in the first factor and red up in the first
res.l2<- lapply(res.l, function(x) {x[x$adj.P.Val>10e-8 & x$adj.P.Val<10e-5,]})
tmodPanelPlot(res.l2, pie=pie, text.cex=0.6) ##### zero = grey, blue down in the first factor and red up in the first